WebDBReader.java example

Explorer

damp.ekeko.snippets-master
- damp.ekeko.snippets.plugin
  - src
    - damp
      - ekeko
        snippets
        BoundDirective.java
        DirectiveOperandBinding.java
        EkekoSnippetsPlugin.java
        ExtractedSnippet.java
        NaiveASTFlattener.java
        OperatorOperandBinding.java
        SnippetBaseListener.java
        SnippetBaseVisitor.java
        SnippetExtractor.java
        SnippetLexer.java
        SnippetListener.java
        SnippetParser.java
        SnippetVisitor.java
        data
        SnippetOperator.java
        TemplateGroup.java
        geneticsearch
        PartialJavaProjectModel.java
        gui
        BoundDirectivesEditorDialog.java
        BoundDirectivesViewer.java
        ChartCanvas.java
        ClojureFileEditorInput.java
        DirectiveOperandBindingEditingSupport.java
        DirectiveOperandBindingLabelProviderValue.java
        DirectiveSelectionDialog.java
        IntendedResultsEditor.java
        IntendedResultsEditorCommandHandler.java
        IntendedResultsEditorInput.java
        IntendedResultsEditorPersistableElementFactory.java
        MutationHistoryDialog.java
        OperandBindingLabelProviderDescription.java
        OperatorOperandBindingEditingSupport.java
        OperatorOperandBindingLabelProviderValue.java
        OperatorOperandsView.java
        OperatorOperandsViewer.java
        OperatorTreeContentProvider.java
        OperatorTreeLabelProvider.java
        PopulationInspectorDialog.java
        QueryInspectorDialog.java
        RecommendationEditor.java
        RecommendationEditorCommandHandler.java
        RecommendationEditorInput.java
        RecommendationEditorPersistableElementFactory.java
        RewritesTemplateEditor.java
        SubjectsTemplateEditor.java
        TemplateCodeGenerator.java
        TemplateEditor.java
        TemplateEditorActionBarContributor.java
        TemplateEditorCommandHandler.java
        TemplateEditorInput.java
        TemplateEditorPersistableElementFactory.java
        TemplateGroupNodeSelectionDialog.java
        TemplateGroupTemplateElement.java
        TemplateGroupViewer.java
        TemplateGroupViewerNodeDoubleClickListener.java
        TemplateGroupViewerNodeSelectionEvent.java
        TemplateGroupViewerNodeSelectionListener.java
        TemplatePrettyPrinter.java
        TemplateTreeContentProvider.java
        TemplateTreeLabelProviders.java
        TransformationEditor.java
        TransformationEditorActionBarContributor.java
        TransformationEditorCommandHandler.java
        TransformationEditorInput.java
        TransformationEditorPersistableElementFactory.java
        TransformationOverviewEditor.java
    - ec
      - util
        MersenneTwister.java
- damp.ekeko.snippets.plugin.test
  - resources
  - src
    - test
      - damp
        ekeko
        snippets
        EkekoSnippetsTest.java
        experiments
        GeneticSearchTest.java

/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.db;

import java.io.*;
import java.util.*;
import java.nio.channels.*;

import net.nutch.io.*;
import net.nutch.pagedb.*;
import net.nutch.linkdb.*;

/**********************************************
 * The WebDBReader implements all the read-only
 * parts of accessing our web database.
 * All the writing ones can be found in WebDBWriter.
 *
 * @author Mike Cafarella
 **********************************************/
public class WebDBReader implements IWebDBReader {
    static final Page[] PAGE_RECORDS = new Page[0];
    static final Link[] LINK_RECORDS = new Link[0];

    // filenames
    static final String PAGES_BY_URL = "pagesByURL";
    static final String PAGES_BY_MD5 = "pagesByMD5";
    static final String LINKS_BY_URL = "linksByURL";
    static final String LINKS_BY_MD5 = "linksByMD5";
    static final String STATS_FILE = "stats";

    File dbFile;
    MapFile.Reader pagesByURL, pagesByMD5, linksByURL, linksByMD5;
    long totalPages = 0, totalLinks = 0;
    Vector mapReaders = null, setReaders = null;
    FileInputStream dbReadLockData;
    FileLock dbReadLock;

    /**
     * Open a web db reader for the named directory.
     */    
    public WebDBReader(File dir) throws IOException, FileNotFoundException {
        this.dbFile = new File(dir, "webdb");

        // Obtain read lock on db so writers don't try to 
        // move it out from under us.  This obtains a non-exclusive
        // lock on the directory that holds the dbs (old and new)
        File readLockFile = new File(dir, "dbreadlock");
        readLockFile.createNewFile();
        this.dbReadLockData = new FileInputStream(readLockFile);
        this.dbReadLock = dbReadLockData.getChannel().lock(0L, Long.MAX_VALUE, true);

        // Create tables
        this.pagesByURL = new MapFile.Reader(new File(dbFile, PAGES_BY_URL).getPath(), new UTF8.Comparator());
        this.pagesByMD5 = new MapFile.Reader(new File(dbFile, PAGES_BY_MD5).getPath(), new Page.Comparator());

        this.linksByURL = new MapFile.Reader(new File(dbFile, LINKS_BY_URL).getPath(), new Link.UrlComparator());
        this.linksByMD5 = new MapFile.Reader(new File(dbFile, LINKS_BY_MD5).getPath(), new Link.MD5Comparator());

        // Load in statistics
        File stats = new File(dbFile, STATS_FILE);
        if (stats.exists()) {
            DataInputStream in = new DataInputStream(new FileInputStream(stats));
            try {
                int version = (byte) in.read();
                this.totalPages = in.readLong();
                this.totalLinks = in.readLong();
            } finally {
                in.close();
            }
        }

        // Create vectors so we can GC readers used by 
        // enum() calls.  We do this so we can have multiple
        // simultaneous enum users.  However, since we keep
        // a handle to each one, we're assuming that we don't
        // create too many before WebDBReader.close() is called.
        this.mapReaders = new Vector();
        this.setReaders = new Vector();
    }

    /**
     * Shutdown
     */
    public void close() throws IOException {
        pagesByURL.close();
        pagesByMD5.close();
        linksByURL.close();
        linksByMD5.close();

        for (Enumeration e = mapReaders.elements(); e.hasMoreElements(); ) {
            MapFile.Reader tmp = (MapFile.Reader) e.nextElement();
            tmp.close();
        }
        for (Enumeration e = setReaders.elements(); e.hasMoreElements(); ) {
            SetFile.Reader tmp = (SetFile.Reader) e.nextElement();
            tmp.close();
        }

        // release the lock
        dbReadLock.release();
        dbReadLockData.close();
    }

    /**
     * Get Page from the pagedb with the given URL
     */
    public Page getPage(String url) throws IOException {
        return (Page) pagesByURL.get(new UTF8(url), new Page());
    }

    /**
     * Get Pages from the pagedb according to their
     * content hash.
     */
    public Page[] getPages(MD5Hash md5) throws IOException {
        Vector records = new Vector(3);
        Page p = new Page();
        p.getMD5().set(md5);

        pagesByMD5.seek(p);
        while (pagesByMD5.next(p, NullWritable.get())) {
            if (p.getMD5().compareTo(md5) == 0) {
                records.add(p);
                p = new Page();
            } else {
                break;
            }
        }

        // Xfer from the vector into an array
        return (Page[]) records.toArray(PAGE_RECORDS);
    }

    /**
     * Test whether a certain piece of content is in the
     * database, but don't bother returning the Page(s) itself.
     */
    public boolean pageExists(MD5Hash md5) throws IOException {
        Page p = new Page();
        p.getMD5().set(md5);
        pagesByMD5.seek(p);
        if (pagesByMD5.next(p, NullWritable.get()) && p.getMD5().compareTo(md5) == 0) {
            return true;
        } else {
            return false;
        }
    }

    /**
     * Iterate through all the Pages, sorted by URL
     */
    public Enumeration pages() throws IOException {
        MapFile.Reader tmpReader = new MapFile.Reader(new File(dbFile, "pagesByURL").getPath());
        mapReaders.add(tmpReader);
        return new TableEnumerator(tmpReader);
    }

    //
    // The TableEnumerator goes through all the entries
    // in the Table (which is a MapFile).
    //
    class TableEnumerator implements Enumeration {
        MapFile.Reader reader;
        Page nextItem;

        /**
         * Start the cursor and find the first item.
         * Store it for later return.
         */
        public TableEnumerator(MapFile.Reader reader) {
            this.reader = reader;
            this.nextItem = new Page();
            try {
                if (! reader.next(new UTF8(), this.nextItem)) {
                    this.nextItem = null;
                }
            } catch (IOException ie) {
                this.nextItem = null;
            }
        }

        /**
         * If there's no item left in store, we've hit the end.
         */
        public boolean hasMoreElements() {
            return (nextItem != null);
        }

        /**
         * Set aside the item we have in store.  Then retrieve
         * another for the next time we're called.  Finally, return
         * the set-aside item.
         */
        public Object nextElement() {
            if (nextItem == null) {
                throw new NoSuchElementException("PageDB Enumeration");
            }
            Page toReturn = nextItem;
            this.nextItem = new Page();
            try {
                if (! reader.next(new UTF8(), nextItem)) {
                    this.nextItem = null;
                }
            } catch (IOException ie) {
                this.nextItem = null;
            }
            return toReturn;
        }
    }


    /**
     * Iterate through all the Pages, sorted by MD5
     */
    public Enumeration pagesByMD5() throws IOException {
        SetFile.Reader tmpReader = new SetFile.Reader(new File(dbFile, "pagesByMD5").getPath());
        setReaders.add(tmpReader);
        return new IndexEnumerator(tmpReader);
    }

    /**
     * Return the number of pages we're dealing with
     */
    public long numPages() {
        return totalPages;
    }

    //
    // The IndexEnumerator goes through all the entries
    // in the index (which is a SequenceFile).
    //
    class IndexEnumerator implements Enumeration {
        SetFile.Reader reader;
        Page nextItem;

        /**
         * Start the cursor and find the first item.
         * Store it for later return.
         */
        public IndexEnumerator(SetFile.Reader reader) {
            this.reader = reader;
            this.nextItem = new Page();
            try {
                if (! reader.next(nextItem)) {
                    this.nextItem = null;
                }
            } catch (IOException ie) {
                this.nextItem = null;
            }
        }

        /**
         * If there's no item left in store, we've hit the end.
         */
        public boolean hasMoreElements() {
            return (nextItem != null);
        }

        /**
         * Set aside the item we have in store.  Then retrieve
         * another for the next time we're called.  Finally, return
         * the set-aside item.
         */
        public Object nextElement() {
            if (nextItem == null) {
                throw new NoSuchElementException("PageDB Enumeration");
            }

            Page toReturn = nextItem;
            this.nextItem = new Page();
            try {
                if (! reader.next(nextItem)) {
                    this.nextItem = null;
                }
            } catch (IOException ie) {
                this.nextItem = null;
            }
            return toReturn;
        }
    }

    /**
     * Get all the hyperlinks that link TO the indicated URL.
     */     
    public Link[] getLinks(UTF8 url) throws IOException {
        Vector records = new Vector(3);
        Link l = new Link();
        l.getURL().set(url);

        linksByURL.seek(l);
        while (linksByURL.next(l, NullWritable.get())) {
            if (url.equals(l.getURL())) {
                records.add(l);
                l = new Link();
            } else {
                break;
            }
        }
        
        // Xfer from the vector into an array
        return (Link[]) records.toArray(LINK_RECORDS);
    }

    /**
     * Grab all the links from the given MD5 hash.
     */
    public Link[] getLinks(MD5Hash md5) throws IOException {
        Vector records = new Vector(3);
        Link l = new Link();
        l.getFromID().set(md5);

        linksByMD5.seek(l);
        while (linksByMD5.next(l, NullWritable.get())) {
            if (md5.equals(l.getFromID())) {
                records.add(l);
                l = new Link();
            } else {
                break;
            }
        }
        
        // Xfer from the vector into an array
        return (Link[]) records.toArray(LINK_RECORDS);
    }

    /**
     * Return all the links, by target URL
     */
    public Enumeration links() {
        return new MapEnumerator(linksByURL);
    }

    /**
     * Return the number of links in our db.
     */
    public long numLinks() {
        return totalLinks;
    }

    //
    // Here's the class for the above function
    //
    class MapEnumerator implements Enumeration {
        MapFile.Reader reader;
        Link nextItem;

        /**
         * Start the cursor and find the first item.
         * Store it for later return.
         */
        public MapEnumerator(MapFile.Reader reader) {
            this.reader = reader;
            this.nextItem = new Link();
            try {
                if (! reader.next(this.nextItem, NullWritable.get())) {
                    this.nextItem = null;
                }
            } catch (IOException ie) {
                this.nextItem = null;
            }
        }

        /**
         * If there's no item left in store, we've hit the end.
         */
        public boolean hasMoreElements() {
            return (nextItem != null);
        }

        /**
         * Set aside the item we have in store.  Then retrieve
         * another for the next time we're called.  Finally, return
         * the set-aside item.
         */
        public Object nextElement() {
            if (nextItem == null) {
                throw new NoSuchElementException("PageDB Enumeration");
            }

            Link toReturn = nextItem;
            this.nextItem = new Link();
            try {
                if (! reader.next(nextItem, NullWritable.get())) {
                    this.nextItem = null;
                }
            } catch (IOException ie) {
                this.nextItem = null;
            }
            return toReturn;
        }
    }

    /**
     * The WebDBReader.main() provides some handy utility methods
     * for looking through the contents of the webdb.  Hoo-boy!
     */
    public static void main(String argv[]) throws FileNotFoundException, IOException {
        if (argv.length < 2) {
            System.out.println("Usage: java net.nutch.db.WebDBReader <db> [-pageurl url] | [-pagemd5 md5] | [-dumppageurl] | [-dumppagemd5] | [-toppages <k>] | [-linkurl url] | [-linkmd5 md5] | [-dumplinks] | [-stats]");
            return;

        }

        WebDBReader reader = new WebDBReader(new File(argv[0]));
        try {
            if ("-pageurl".equals(argv[1])) {
                String url = argv[2];
                System.out.println(reader.getPage(url.trim()));
            } else if ("-pagemd5".equals(argv[1])) {
                MD5Hash md5 = new MD5Hash(argv[2]);
                Page pages[] = reader.getPages(md5);
                System.out.println("Found " + pages.length + " pages.");
                for (int i = 0; i < pages.length; i++) {
                    System.out.println("Page " + i + ": " + pages[i]);
                }
            } else if ("-dumppageurl".equals(argv[1])) {
                System.out.println(reader);
                System.out.println();
                int i = 1;
                for (Enumeration e = reader.pages(); e.hasMoreElements(); i++) {
                    Page page = (Page) e.nextElement();
                    System.out.println("Page " + i + ": " + page);
                    System.out.println();
                }
            } else if ("-dumppagemd5".equals(argv[1])) {
                System.out.println(reader);
                System.out.println();
                int i = 1;
                for (Enumeration e = reader.pagesByMD5(); e.hasMoreElements(); i++) {
                    Page page = (Page) e.nextElement();
                    System.out.println("Page " + i + ": " + page);
                    System.out.println();
                }
            } else if ("-toppages".equals(argv[1])) {
                int topSize = Integer.parseInt(argv[2]);

                // Create a sorted list
                SortedSet topSet = new TreeSet(new Comparator() {
                    public int compare(Object o1, Object o2) {
                        Page p1 = (Page) o1;
                        Page p2 = (Page) o2;
                        if (p1.getScore() < p2.getScore()) {
                            return -1;
                        } else if (p1.getScore() == p2.getScore()) {
                            // If two scores are equal, we will
                            // use regular Page comparison (which
                            // uses URL as the primary key).  We
                            // don't want to uniquify by score!
                            return p1.compareTo(p2);
                        } else {
                            return 1;
                        }
                    }
                }
                    );

                // Find the top "topSize" elts
                Page lowestPage = null;
                for (Enumeration e = reader.pages(); e.hasMoreElements(); ) {
                    Page curPage = (Page) e.nextElement();
                    if (topSet.size() < topSize) {
                        topSet.add(curPage);
                        lowestPage = (Page) topSet.first();
                    } else if (lowestPage.getScore() < curPage.getScore()) {
                        topSet.remove(lowestPage);
                        topSet.add(curPage);
                        lowestPage = (Page) topSet.first();
                    }
                }
            
                // Print them out
                int i = 0;
                for (Iterator it = topSet.iterator(); it.hasNext(); i++) {
                    System.out.println("Page " + i + ": " + (Page) it.next());
                    System.out.println();
                }
            } else if ("-linkurl".equals(argv[1])) {
                String url = argv[2];
                Link links[] = reader.getLinks(new UTF8(url.trim()));
                System.out.println("Found " + links.length + " links.");
                for (int i = 0; i < links.length; i++) {
                    System.out.println("Link " + i + ": " + links[i]);
                }
            } else if ("-linkmd5".equals(argv[1])) {
                MD5Hash fromID = new MD5Hash(argv[2]);
                Link links[] = reader.getLinks(fromID);
                System.out.println("Found " + links.length + " links.");
                for (int i = 0; i < links.length; i++) {
                    System.out.println("Link " + i + ": " + links[i]);
                }
            } else if ("-dumplinks".equals(argv[1])) {
                System.out.println(reader);
                System.out.println();
                Enumeration e = reader.pagesByMD5();
                while (e.hasMoreElements()) {
                  Page page = (Page) e.nextElement();
                  Link[] links = reader.getLinks(page.getMD5());
                  if (links.length > 0) {
                    System.out.println("from " + page.getURL());
                    for (int i = 0; i < links.length; i++) {
                      System.out.println(" to " + links[i].getURL());
                    }
                    System.out.println();
                  }
                }
            } else if ("-stats".equals(argv[1])) {
                System.out.println("Stats for " + reader);
                System.out.println("-------------------------------");
                System.out.println("Number of pages: " + reader.numPages());
                System.out.println("Number of links: " + reader.numLinks());
            } else {
                System.out.println("Sorry, no command with name " + argv[1]);
            }
        } finally {
            reader.close();
        }
    }
}